Exploratory Data Analysis

DESCRIPTION

Over 370000 used cars scraped with Scrapy from Ebay-Kleinanzeigen.

Import Libraries:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Any results you write to the current directory are saved as output.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import math
import scipy.stats as stats

Load the Dataset:

dataset_path = "autos.csv"  # Replace with the actual path to your dataset
df = pd.read_csv(dataset_path, encoding='latin-1')

Initial Data Exploration:

# Display the first few rows of the dataset
print("Head of the dataset:")
print(df.head())

# Check the dimensions of the dataset
print("Dimensions:")
print(df.shape)

# Display the column names
print("Column names:")
print(df.columns)
Head of the dataset:
           dateCrawled                            name  seller offerType  \
0  2016-03-24 11:52:17                      Golf_3_1.6  privat   Angebot   
1  2016-03-24 10:58:45            A5_Sportback_2.7_Tdi  privat   Angebot   
2  2016-03-14 12:52:21  Jeep_Grand_Cherokee_"Overland"  privat   Angebot   
3  2016-03-17 16:54:04             GOLF_4_1_4__3TÜRER  privat   Angebot   
4  2016-03-31 17:25:20  Skoda_Fabia_1.4_TDI_PD_Classic  privat   Angebot   

   price abtest vehicleType  yearOfRegistration    gearbox  powerPS  model  \
0    480   test         NaN                1993    manuell        0   golf   
1  18300   test       coupe                2011    manuell      190    NaN   
2   9800   test         suv                2004  automatik      163  grand   
3   1500   test  kleinwagen                2001    manuell       75   golf   
4   3600   test  kleinwagen                2008    manuell       69  fabia   

   kilometer  monthOfRegistration fuelType       brand notRepairedDamage  \
0     150000                    0   benzin  volkswagen               NaN   
1     125000                    5   diesel        audi                ja   
2     125000                    8   diesel        jeep               NaN   
3     150000                    6   benzin  volkswagen              nein   
4      90000                    7   diesel       skoda              nein   

           dateCreated  nrOfPictures  postalCode             lastSeen  
0  2016-03-24 00:00:00             0       70435  2016-04-07 03:16:57  
1  2016-03-24 00:00:00             0       66954  2016-04-07 01:46:50  
2  2016-03-14 00:00:00             0       90480  2016-04-05 12:47:46  
3  2016-03-17 00:00:00             0       91074  2016-03-17 17:40:17  
4  2016-03-31 00:00:00             0       60437  2016-04-06 10:17:21  
Dimensions:
(371528, 20)
Column names:
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')
# Get the summary statistics of the numerical columns
print("Summary statistics:")
df.describe()
Summary statistics:
price yearOfRegistration powerPS kilometer monthOfRegistration nrOfPictures postalCode
count 3.715280e+05 371528.000000 371528.000000 371528.000000 371528.000000 371528.0 371528.00000
mean 1.729514e+04 2004.577997 115.549477 125618.688228 5.734445 0.0 50820.66764
std 3.587954e+06 92.866598 192.139578 40112.337051 3.712412 0.0 25799.08247
min 0.000000e+00 1000.000000 0.000000 5000.000000 0.000000 0.0 1067.00000
25% 1.150000e+03 1999.000000 70.000000 125000.000000 3.000000 0.0 30459.00000
50% 2.950000e+03 2003.000000 105.000000 150000.000000 6.000000 0.0 49610.00000
75% 7.200000e+03 2008.000000 150.000000 150000.000000 9.000000 0.0 71546.00000
max 2.147484e+09 9999.000000 20000.000000 150000.000000 12.000000 0.0 99998.00000
# Check the data types of each column
print("Data types:")
print(df.dtypes) 
Data types:
dateCrawled            object
name                   object
seller                 object
offerType              object
price                   int64
abtest                 object
vehicleType            object
yearOfRegistration      int64
gearbox                object
powerPS                 int64
model                  object
kilometer               int64
monthOfRegistration     int64
fuelType               object
brand                  object
notRepairedDamage      object
dateCreated            object
nrOfPictures            int64
postalCode              int64
lastSeen               object
dtype: object

Data Cleaning:

Clean the dataset by handling missing values, removing irrelevant columns, and converting data types if needed.

# Check for missing values
print("Missing values:")
df.isnull().sum()
Missing values:
dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            37869
yearOfRegistration         0
gearbox                20209
powerPS                    0
model                  20484
kilometer                  0
monthOfRegistration        0
fuelType               33386
brand                      0
notRepairedDamage      72060
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64
# Calculate the percentage of null values in each column
null_percentage = (df.isnull().sum() / len(df)) * 100
print(null_percentage)
dateCrawled             0.000000
name                    0.000000
seller                  0.000000
offerType               0.000000
price                   0.000000
abtest                  0.000000
vehicleType            10.192771
yearOfRegistration      0.000000
gearbox                 5.439429
powerPS                 0.000000
model                   5.513447
kilometer               0.000000
monthOfRegistration     0.000000
fuelType                8.986133
brand                   0.000000
notRepairedDamage      19.395577
dateCreated             0.000000
nrOfPictures            0.000000
postalCode              0.000000
lastSeen                0.000000
dtype: float64
Making neccessary data type changes and adding columns
# Convert the date column to datetime format
date_column = "dateCrawled"  # Replace with the actual name of your date column
df[date_column] = pd.to_datetime(df[date_column])
# Create separate date and time columns
df['date'] = df[date_column].dt.date
df['time'] = df[date_column].dt.time

# change datatype from object to date time
df['date'] = pd.to_datetime(df['date'])
df['time'] = pd.to_datetime(df['time'],format='%H:%M:%S').dt.strftime('%H:%M:%S')
# Change the data type of column 'A' to integer
df['postalCode'] = df['postalCode'].astype(object)
# Remove null values in a specific column about 10% of the data
column_with_null = ['vehicleType','model','fuelType']  # Replace with the actual column name
df_without_nulls = df.dropna(subset= column_with_null)
df_without_nulls.isnull().sum()
dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType                0
yearOfRegistration         0
gearbox                 7258
powerPS                    0
model                      0
kilometer                  0
monthOfRegistration        0
fuelType                   0
brand                      0
notRepairedDamage      41748
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
date                       0
time                       0
dtype: int64
# gearbox 

# Replace null values with "NA" in a specific column for no information collected
column_of_interest = 'gearbox'  # Replace with the actual column name
df_without_nulls[column_of_interest].fillna("NA", inplace=True)
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/1978238027.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# notRepairedDamage : if the car has a damage which is not repaired yet

# Replace null values with "NA" in a specific column for no information collected
column_of_interest = 'notRepairedDamage'  # Replace with the actual column name
df_without_nulls[column_of_interest].fillna("NA", inplace=True)
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/3712268988.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Count of repaired Damage
df_without_nulls['notRepairedDamage'].value_counts()
nein    236725
NA       41748
ja       28649
Name: notRepairedDamage, dtype: int64
df_without_nulls.isnull().sum()
dateCrawled            0
name                   0
seller                 0
offerType              0
price                  0
abtest                 0
vehicleType            0
yearOfRegistration     0
gearbox                0
powerPS                0
model                  0
kilometer              0
monthOfRegistration    0
fuelType               0
brand                  0
notRepairedDamage      0
dateCreated            0
nrOfPictures           0
postalCode             0
lastSeen               0
date                   0
time                   0
dtype: int64
# df_without_nulls
# # Replace invalid values with NaN
# df.loc[df["yearOfRegistration"] == 1111, "yearOfRegistration"] = np.nan

# Convert yearOfRegistration to datetime format
df_without_nulls["yearOfRegistration"] = pd.to_datetime(df_without_nulls["yearOfRegistration"], format="%Y")
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2823635449.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Drop irrelevant columns
# df_without_nulls = df_without_nulls['date', 'time', 'name', 'seller', 'offerType', 'price', 'abtest',
#        'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
#        'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
#        'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode' ]


df_without_nulls = df_without_nulls.drop(['dateCrawled', 'lastSeen' ], axis=1)
df_without_nulls = df_without_nulls[['date', 'time', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode' ]]
df_without_nulls
date time name seller offerType price abtest vehicleType yearOfRegistration gearbox powerPS model kilometer monthOfRegistration fuelType brand notRepairedDamage dateCreated nrOfPictures postalCode
2 2016-03-14 12:52:21 Jeep_Grand_Cherokee_"Overland" privat Angebot 9800 test suv 2004-01-01 automatik 163 grand 125000 8 diesel jeep NA 2016-03-14 00:00:00 0 90480
3 2016-03-17 16:54:04 GOLF_4_1_4__3TÜRER privat Angebot 1500 test kleinwagen 2001-01-01 manuell 75 golf 150000 6 benzin volkswagen nein 2016-03-17 00:00:00 0 91074
4 2016-03-31 17:25:20 Skoda_Fabia_1.4_TDI_PD_Classic privat Angebot 3600 test kleinwagen 2008-01-01 manuell 69 fabia 90000 7 diesel skoda nein 2016-03-31 00:00:00 0 60437
5 2016-04-04 17:36:23 BMW_316i___e36_Limousine___Bastlerfahrzeug__Ex... privat Angebot 650 test limousine 1995-01-01 manuell 102 3er 150000 10 benzin bmw ja 2016-04-04 00:00:00 0 33775
6 2016-04-01 20:48:51 Peugeot_206_CC_110_Platinum privat Angebot 2200 test cabrio 2004-01-01 manuell 109 2_reihe 150000 8 benzin peugeot nein 2016-04-01 00:00:00 0 67112
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
371521 2016-03-27 20:36:20 Opel_Zafira_1.6_Elegance_TÜV_12/16 privat Angebot 1150 control bus 2000-01-01 manuell 0 zafira 150000 3 benzin opel nein 2016-03-27 00:00:00 0 26624
371524 2016-03-05 19:56:21 Smart_smart_leistungssteigerung_100ps privat Angebot 1199 test cabrio 2000-01-01 automatik 101 fortwo 125000 3 benzin smart nein 2016-03-05 00:00:00 0 26135
371525 2016-03-19 18:57:12 Volkswagen_Multivan_T4_TDI_7DC_UY2 privat Angebot 9200 test bus 1996-01-01 manuell 102 transporter 150000 3 diesel volkswagen nein 2016-03-19 00:00:00 0 87439
371526 2016-03-20 19:41:08 VW_Golf_Kombi_1_9l_TDI privat Angebot 3400 test kombi 2002-01-01 manuell 100 golf 150000 6 diesel volkswagen NA 2016-03-20 00:00:00 0 40764
371527 2016-03-07 19:39:19 BMW_M135i_vollausgestattet_NP_52.720____Euro privat Angebot 28990 control limousine 2013-01-01 manuell 320 m_reihe 50000 8 benzin bmw nein 2016-03-07 00:00:00 0 73326

307122 rows × 20 columns

from translate import Translator
import pandas as pd

# Function to translate a list of unique words from German to English
def translate_list(unique_words):
    translator = Translator(to_lang='en', from_lang='de')
    translations = [translator.translate(word) for word in unique_words]
    return translations


# Define the unique words in German
seller = ['privat', 'gewerblich']
offerType = ['Angebot', 'Gesuch']
vehicleType = ['suv', 'kleinwagen', 'limousine', 'cabrio', 'bus', 'kombi', 'coupe', 'andere']
gearbox = ['automatik', 'manuell', 'NA']
fuelType = ['diesel', 'benzin', 'lpg', 'andere', 'hybrid', 'cng', 'elektro']
notRepairedDamage = ['NA', 'nein', 'ja']

# Translate the unique words from German to English
translated_seller = translate_list(seller)
translated_offerType = translate_list(offerType)
translated_vehicleType = translate_list(vehicleType)
translated_gearbox = translate_list(gearbox)
translated_fuelType = translate_list(fuelType)
translated_notRepairedDamage = translate_list(notRepairedDamage)

# Create the translation dictionaries
seller_dict = dict(zip(seller, translated_seller))
offerType_dict = dict(zip(offerType, translated_offerType))
vehicleType_dict = dict(zip(vehicleType, translated_vehicleType))
gearbox_dict = dict(zip(gearbox, translated_gearbox))
fuelType_dict = dict(zip(fuelType, translated_fuelType))
notRepairedDamage_dict = dict(zip(notRepairedDamage, translated_notRepairedDamage))


df = df_without_nulls

# Replace German words with English using the translation dictionaries
df['seller'] = df['seller'].map(seller_dict)
df['offerType'] = df['offerType'].map(offerType_dict)
df['vehicleType'] = df['vehicleType'].map(vehicleType_dict)
df['gearbox'] = df['gearbox'].map(gearbox_dict)
df['fuelType'] = df['fuelType'].map(fuelType_dict)
df['notRepairedDamage'] = df['notRepairedDamage'].map(notRepairedDamage_dict)

# Print the updated DataFrame
df
date time name seller offerType price abtest vehicleType yearOfRegistration gearbox powerPS model kilometer monthOfRegistration fuelType brand notRepairedDamage dateCreated nrOfPictures postalCode
2 2016-03-14 12:52:21 Jeep_Grand_Cherokee_"Overland" private Offer 9800 test SUV 2004-01-01 Automatic 163 grand 125000 8 Diesel jeep NA 2016-03-14 00:00:00 0 90480
3 2016-03-17 16:54:04 GOLF_4_1_4__3TÜRER private Offer 1500 test Small car 2001-01-01 Manual 75 golf 150000 6 Gasoline volkswagen No 2016-03-17 00:00:00 0 91074
4 2016-03-31 17:25:20 Skoda_Fabia_1.4_TDI_PD_Classic private Offer 3600 test Small car 2008-01-01 Manual 69 fabia 90000 7 Diesel skoda No 2016-03-31 00:00:00 0 60437
5 2016-04-04 17:36:23 BMW_316i___e36_Limousine___Bastlerfahrzeug__Ex... private Offer 650 test Limousine 1995-01-01 Manual 102 3er 150000 10 Gasoline bmw Yes 2016-04-04 00:00:00 0 33775
6 2016-04-01 20:48:51 Peugeot_206_CC_110_Platinum private Offer 2200 test Convertible 2004-01-01 Manual 109 2_reihe 150000 8 Gasoline peugeot No 2016-04-01 00:00:00 0 67112
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
371521 2016-03-27 20:36:20 Opel_Zafira_1.6_Elegance_TÜV_12/16 private Offer 1150 control Bus 2000-01-01 Manual 0 zafira 150000 3 Gasoline opel No 2016-03-27 00:00:00 0 26624
371524 2016-03-05 19:56:21 Smart_smart_leistungssteigerung_100ps private Offer 1199 test Convertible 2000-01-01 Automatic 101 fortwo 125000 3 Gasoline smart No 2016-03-05 00:00:00 0 26135
371525 2016-03-19 18:57:12 Volkswagen_Multivan_T4_TDI_7DC_UY2 private Offer 9200 test Bus 1996-01-01 Manual 102 transporter 150000 3 Diesel volkswagen No 2016-03-19 00:00:00 0 87439
371526 2016-03-20 19:41:08 VW_Golf_Kombi_1_9l_TDI private Offer 3400 test Combi 2002-01-01 Manual 100 golf 150000 6 Diesel volkswagen NA 2016-03-20 00:00:00 0 40764
371527 2016-03-07 19:39:19 BMW_M135i_vollausgestattet_NP_52.720____Euro private Offer 28990 control Limousine 2013-01-01 Manual 320 m_reihe 50000 8 Gasoline bmw No 2016-03-07 00:00:00 0 73326

307122 rows × 20 columns

Data Exploration Analysis

Descriptive Statistics:

Calculate summary statistics and gain insights into the distribution of numerical variables.

df["age"] = df["date"].dt.year - df["yearOfRegistration"].dt.year
# Get the summary statistics of numerical columns
print("Summary statistics:")
print(df.describe())
Summary statistics:
              price        powerPS      kilometer  monthOfRegistration  \
count  3.071220e+05  307122.000000  307122.000000        307122.000000   
mean   9.404320e+03     121.801932  125391.180052             6.067527   
std    5.192426e+05     171.919845   39347.308065             3.551136   
min    0.000000e+00       0.000000    5000.000000             0.000000   
25%    1.399000e+03      75.000000  100000.000000             3.000000   
50%    3.450000e+03     110.000000  150000.000000             6.000000   
75%    7.980000e+03     150.000000  150000.000000             9.000000   
max    1.000000e+08   20000.000000  150000.000000            12.000000   

       nrOfPictures            age  
count      307122.0  307122.000000  
mean            0.0      13.096294  
std             0.0       6.597030  
min             0.0      -2.000000  
25%             0.0       9.000000  
50%             0.0      13.000000  
75%             0.0      17.000000  
max             0.0     106.000000  

IQR Method

remove outliers from the "price" column in your DataFrame,

you can use statistical methods such as Z-score or interquartile range (IQR)

to identify and filter out the outlier values.

# Calculate the IQR
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

# Define the lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the DataFrame to exclude outliers
df_filtered = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]
# save filtered dataset 
filename = 'filteredData.csv'
df_filtered.to_csv(filename, header = True, index=False)
plt.figure(figsize=(8, 6))
sns.histplot(df_filtered["price"], bins=50)  # Adjust the number of bins as needed
plt.title("Distribution of Price")
plt.xlabel("Price")
plt.ylabel("Count")
plt.show()
<Figure size 800x600 with 0 Axes>
<Axes: xlabel='price', ylabel='Count'>
Text(0.5, 1.0, 'Distribution of Price')
Text(0.5, 0, 'Price')
Text(0, 0.5, 'Count')

Correlation metrix table

As per the analysis, there is a 0.55 correlation between YearOfRegistration and Price of Car sales on Ebay

def calculate_correlation_matrix(data):
    # Filter out non-numerical columns
    numeric_columns = data.select_dtypes(include='number')

    # Calculate the correlation matrix
    correlation_matrix = numeric_columns.corr()

    return correlation_matrix

def correlation_matrix_to_table(correlation_matrix):
    # Convert the correlation matrix to a DataFrame
    correlation_table = pd.DataFrame(correlation_matrix)

    # Reset the index and rename the columns
    correlation_table = correlation_table.reset_index().rename(columns={'index': 'Variable'})

    return correlation_table


# Calculate the correlation matrix
correlation_matrix = calculate_correlation_matrix(df_filtered)
correlation_matrix_to_table(correlation_matrix)
# correlation_matrix = data.corr()
print("Correlation Matrix:")
correlation_matrix
Variable price powerPS kilometer monthOfRegistration nrOfPictures age
0 price 1.000000 0.160853 -0.358620 0.063203 NaN -0.545362
1 powerPS 0.160853 1.000000 0.027274 0.019554 NaN -0.057651
2 kilometer -0.358620 0.027274 1.000000 -0.007415 NaN 0.301899
3 monthOfRegistration 0.063203 0.019554 -0.007415 1.000000 NaN -0.062286
4 nrOfPictures NaN NaN NaN NaN NaN NaN
5 age -0.545362 -0.057651 0.301899 -0.062286 NaN 1.000000
Correlation Matrix:
price powerPS kilometer monthOfRegistration nrOfPictures age
price 1.000000 0.160853 -0.358620 0.063203 NaN -0.545362
powerPS 0.160853 1.000000 0.027274 0.019554 NaN -0.057651
kilometer -0.358620 0.027274 1.000000 -0.007415 NaN 0.301899
monthOfRegistration 0.063203 0.019554 -0.007415 1.000000 NaN -0.062286
nrOfPictures NaN NaN NaN NaN NaN NaN
age -0.545362 -0.057651 0.301899 -0.062286 NaN 1.000000
# Heatmap of correlations
print("Correlation Heatmap:")
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")

# Generate scatter plot matrix
pd.plotting.scatter_matrix(df_filtered, figsize=(10, 10))

# Show the descriptive statistics table
print(stats)

# Show the generated plots
plt.show()
Correlation Heatmap:
<Axes: >
/Users/admin/anaconda3/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:101: UserWarning:

Attempting to set identical low and high xlims makes transformation singular; automatically expanding.

/Users/admin/anaconda3/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:102: UserWarning:

Attempting to set identical low and high ylims makes transformation singular; automatically expanding.

/Users/admin/anaconda3/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:92: UserWarning:

Attempting to set identical low and high xlims makes transformation singular; automatically expanding.

array([[<Axes: xlabel='price', ylabel='price'>,
        <Axes: xlabel='powerPS', ylabel='price'>,
        <Axes: xlabel='kilometer', ylabel='price'>,
        <Axes: xlabel='monthOfRegistration', ylabel='price'>,
        <Axes: xlabel='nrOfPictures', ylabel='price'>,
        <Axes: xlabel='age', ylabel='price'>],
       [<Axes: xlabel='price', ylabel='powerPS'>,
        <Axes: xlabel='powerPS', ylabel='powerPS'>,
        <Axes: xlabel='kilometer', ylabel='powerPS'>,
        <Axes: xlabel='monthOfRegistration', ylabel='powerPS'>,
        <Axes: xlabel='nrOfPictures', ylabel='powerPS'>,
        <Axes: xlabel='age', ylabel='powerPS'>],
       [<Axes: xlabel='price', ylabel='kilometer'>,
        <Axes: xlabel='powerPS', ylabel='kilometer'>,
        <Axes: xlabel='kilometer', ylabel='kilometer'>,
        <Axes: xlabel='monthOfRegistration', ylabel='kilometer'>,
        <Axes: xlabel='nrOfPictures', ylabel='kilometer'>,
        <Axes: xlabel='age', ylabel='kilometer'>],
       [<Axes: xlabel='price', ylabel='monthOfRegistration'>,
        <Axes: xlabel='powerPS', ylabel='monthOfRegistration'>,
        <Axes: xlabel='kilometer', ylabel='monthOfRegistration'>,
        <Axes: xlabel='monthOfRegistration', ylabel='monthOfRegistration'>,
        <Axes: xlabel='nrOfPictures', ylabel='monthOfRegistration'>,
        <Axes: xlabel='age', ylabel='monthOfRegistration'>],
       [<Axes: xlabel='price', ylabel='nrOfPictures'>,
        <Axes: xlabel='powerPS', ylabel='nrOfPictures'>,
        <Axes: xlabel='kilometer', ylabel='nrOfPictures'>,
        <Axes: xlabel='monthOfRegistration', ylabel='nrOfPictures'>,
        <Axes: xlabel='nrOfPictures', ylabel='nrOfPictures'>,
        <Axes: xlabel='age', ylabel='nrOfPictures'>],
       [<Axes: xlabel='price', ylabel='age'>,
        <Axes: xlabel='powerPS', ylabel='age'>,
        <Axes: xlabel='kilometer', ylabel='age'>,
        <Axes: xlabel='monthOfRegistration', ylabel='age'>,
        <Axes: xlabel='nrOfPictures', ylabel='age'>,
        <Axes: xlabel='age', ylabel='age'>]], dtype=object)
<module 'scipy.stats' from '/Users/admin/anaconda3/lib/python3.10/site-packages/scipy/stats/__init__.py'>

# correlation btn price and age of car  (No. of years )
# Create scatter plot
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df_filtered["age"], y=df_filtered["price"])
plt.title("Age of Car vs. Price")
plt.xlabel("Age")
plt.ylabel("Price")
plt.show()
<Figure size 800x600 with 0 Axes>
<Axes: xlabel='age', ylabel='price'>
Text(0.5, 1.0, 'Age of Car vs. Price')
Text(0.5, 0, 'Age')
Text(0, 0.5, 'Price')

# Analyze the distribution of a price vs age of car 
plt.figure(figsize=(8, 6))
sns.histplot(df_filtered["age"])
plt.title("Distribution of Price")
plt.xlabel("age")
plt.ylabel("price")
plt.show()
<Figure size 800x600 with 0 Axes>
<Axes: xlabel='age', ylabel='Count'>
Text(0.5, 1.0, 'Distribution of Price')
Text(0.5, 0, 'age')
Text(0, 0.5, 'price')

## TOP 10 MODELS

# Group the data by model and calculate the average price for each model
model_prices = df_filtered.groupby("model")["price"].mean().reset_index()

# Sort the data by price in descending order
model_prices = model_prices.sort_values("price", ascending=False)

# Select the top 10 models
top_10_models = model_prices.head(10)

# Set the figure size
plt.figure(figsize=(12, 8))

# Create the bar chart
plt.bar(top_10_models["model"], top_10_models["price"], color="b")

# Customize the chart
plt.title("Average Price for Top 10 Models")
plt.xlabel("Model")
plt.ylabel("Price")
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability

# Show the chart
plt.show()
<Figure size 1200x800 with 0 Axes>
<BarContainer object of 10 artists>
Text(0.5, 1.0, 'Average Price for Top 10 Models')
Text(0.5, 0, 'Model')
Text(0, 0.5, 'Price')
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, 'q7'),
  Text(1, 0, 'glk'),
  Text(2, 0, 'a5'),
  Text(3, 0, 'q5'),
  Text(4, 0, 'range_rover_sport'),
  Text(5, 0, 'cc'),
  Text(6, 0, 'a1'),
  Text(7, 0, 'tiguan'),
  Text(8, 0, 'yeti'),
  Text(9, 0, 'kuga')])


## TOP 10 Brands

# Group the data by model and calculate the average price for each model
brand_prices = df_filtered.groupby("brand")["price"].mean().reset_index()

# Sort the data by price in descending order
brand_prices = brand_prices.sort_values("price", ascending=False)

# Select the top 10 models
top_10_brands = brand_prices.head(10)

# Set the figure size
plt.figure(figsize=(12, 8))

# Create the bar chart
plt.bar(top_10_brands["brand"], top_10_brands["price"], color="b")

# Customize the chart
plt.title("Average Price for Top 10 Brands")
plt.xlabel("Brands")
plt.ylabel("Price")
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability

# Show the chart
plt.show()
<Figure size 1200x800 with 0 Axes>
<BarContainer object of 10 artists>
Text(0.5, 1.0, 'Average Price for Top 10 Brands')
Text(0.5, 0, 'Brands')
Text(0, 0.5, 'Price')
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, 'porsche'),
  Text(1, 0, 'mini'),
  Text(2, 0, 'land_rover'),
  Text(3, 0, 'jeep'),
  Text(4, 0, 'jaguar'),
  Text(5, 0, 'audi'),
  Text(6, 0, 'bmw'),
  Text(7, 0, 'skoda'),
  Text(8, 0, 'dacia'),
  Text(9, 0, 'mercedes_benz')])

import pandas as pd
import plotly.graph_objects as go

# Calculate the frequency counts for brands and models
top_10_brands = df_filtered["brand"].value_counts().nlargest(10).index
top_10_models = df_filtered["model"].value_counts().nlargest(10).index

# Filter the data for the top 10 brands
df_top_10_brands = df_filtered[df_filtered["brand"].isin(top_10_brands)]

# Calculate the frequency counts for models within each brand
top_10_models_per_brand = df_top_10_brands.groupby("brand")["model"].value_counts().groupby(level=0).nlargest(10)

# Create a hierarchical list of labels for the pie chart
labels = []
parents = []
values = []

# Add top 10 brand labels
for brand in top_10_brands:
    labels.append(brand)
    parents.append("")
    values.append(top_10_models_per_brand[brand].sum())

# Add top 10 model labels under each brand
for brand in top_10_brands:
    for model in top_10_models_per_brand[brand].index:
        labels.append(model)
        parents.append(brand)
        values.append(top_10_models_per_brand[brand][model])

# Create the pie chart figure using plotly
fig = go.Figure(go.Sunburst(
    labels=labels,
    parents=parents,
    values=values,
))

# Update the layout of the chart
fig.update_layout(
    title="Top 10 Brands and Top 10 Models by Count",
    sunburstcolorway=["#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52"],
    width=800,
    height=600
)

# Show the interactive pie chart
# fig.show()
df = df_filtered
import pandas as pd
import plotly.express as px

# Filter and select the top 10 brands
top_10_brands = df["brand"].value_counts().nlargest(10).index
df_top_10 = df[df["brand"].isin(top_10_brands)]

# Create a treemap chart using plotly express
fig = px.treemap(df_top_10, path=['brand', 'model'], values='price')

# Configure the layout of the chart
fig.update_layout(
    title="Price Distribution by Top 10 Brands and Models",
    margin=dict(t=30, l=0, r=0, b=0),
)


# Convert the date column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Group the data by date and count the number of occurrences
date_count = df.groupby('date').size().reset_index(name='count')

# Create a line plot using plotly express
fig = px.line(date_count, x='date', y='count', title='Trend of Cars Posted')

# Configure the layout of the chart
fig.update_layout(
    xaxis=dict(title='Date'),
    yaxis=dict(title='Count'),
)
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2298839269.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

import pandas as pd
import plotly.express as px


# Create a treemap chart using plotly express
fig = px.treemap(df, path=['seller', 'offerType', 'gearbox', 'fuelType', 'notRepairedDamage'],
                 color='gearbox')

# Configure the layout of the chart
fig.update_layout(
    title="Relationship between Seller, Offer Type, Gearbox, Fuel Type, and Damage Status",
    margin=dict(t=30, l=0, r=0, b=0),
)
# Number of vehicle types 

import pandas as pd
import plotly.express as px


# Create a word cloud graph using Plotly Express
fig = px.histogram(df, x="vehicleType", title="Vehicle Type by Count", width=800, height=500)

# Configure the graph as a word cloud
fig.update_traces(marker=dict(color='skyblue'), selector=dict(type='bar'))
fig.update_layout(barmode='stack', xaxis=dict(categoryorder='total descending'))
# MOST POPULAR CAR MODELS
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go


# Calculate the count of each vehicle type
vehicle_type_counts = df['model'].value_counts()

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(vehicle_type_counts)

# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Vehicle Model Word Cloud')
<Figure size 1000x500 with 0 Axes>
<matplotlib.image.AxesImage at 0x7f7ee614de70>
(-0.5, 799.5, 399.5, -0.5)
Text(0.5, 1.0, 'Vehicle Model Word Cloud')

# Count unique values of a categorical variable
brand_counts = df["brand"].value_counts()

# Sort the counts in descending order
brand_counts_sorted = brand_counts.sort_values(ascending=False)

# Create a bar plot of a categorical variable
plt.figure(figsize=(10, 6))
sns.countplot(x="brand", data=df, order=brand_counts_sorted.index)
plt.title("Distribution of Car Brands")
plt.xlabel("Brand")
plt.ylabel("Count")
plt.xticks(rotation=90)  # Rotate the x-axis labels by 90 degrees
plt.show()
<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='brand', ylabel='count'>
Text(0.5, 1.0, 'Distribution of Car Brands')
Text(0.5, 0, 'Brand')
Text(0, 0.5, 'Count')
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38]),
 [Text(0, 0, 'volkswagen'),
  Text(1, 0, 'opel'),
  Text(2, 0, 'bmw'),
  Text(3, 0, 'mercedes_benz'),
  Text(4, 0, 'audi'),
  Text(5, 0, 'ford'),
  Text(6, 0, 'renault'),
  Text(7, 0, 'peugeot'),
  Text(8, 0, 'fiat'),
  Text(9, 0, 'seat'),
  Text(10, 0, 'skoda'),
  Text(11, 0, 'mazda'),
  Text(12, 0, 'smart'),
  Text(13, 0, 'citroen'),
  Text(14, 0, 'nissan'),
  Text(15, 0, 'toyota'),
  Text(16, 0, 'hyundai'),
  Text(17, 0, 'volvo'),
  Text(18, 0, 'mini'),
  Text(19, 0, 'mitsubishi'),
  Text(20, 0, 'honda'),
  Text(21, 0, 'kia'),
  Text(22, 0, 'suzuki'),
  Text(23, 0, 'alfa_romeo'),
  Text(24, 0, 'chevrolet'),
  Text(25, 0, 'chrysler'),
  Text(26, 0, 'dacia'),
  Text(27, 0, 'daihatsu'),
  Text(28, 0, 'subaru'),
  Text(29, 0, 'jeep'),
  Text(30, 0, 'porsche'),
  Text(31, 0, 'land_rover'),
  Text(32, 0, 'saab'),
  Text(33, 0, 'jaguar'),
  Text(34, 0, 'daewoo'),
  Text(35, 0, 'lancia'),
  Text(36, 0, 'rover'),
  Text(37, 0, 'trabant'),
  Text(38, 0, 'lada')])

# Count unique values of a categorical variable
model_counts = df["model"].value_counts()

# Sort the counts in descending order
model_counts_sorted = model_counts.sort_values(ascending=False)

# Select the top 20 values
model_counts_top20 = model_counts_sorted[:20]

# Create a bar plot of the top 20 categorical variable values
plt.figure(figsize=(10, 6))
sns.countplot(x="model", data=df, order=model_counts_top20.index)
plt.title("Distribution of Car models (Top 20)")
plt.xlabel("model")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='model', ylabel='count'>
Text(0.5, 1.0, 'Distribution of Car models (Top 20)')
Text(0.5, 0, 'model')
Text(0, 0.5, 'Count')
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 [Text(0, 0, 'golf'),
  Text(1, 0, 'andere'),
  Text(2, 0, '3er'),
  Text(3, 0, 'polo'),
  Text(4, 0, 'corsa'),
  Text(5, 0, 'astra'),
  Text(6, 0, 'passat'),
  Text(7, 0, 'a4'),
  Text(8, 0, 'c_klasse'),
  Text(9, 0, '5er'),
  Text(10, 0, 'e_klasse'),
  Text(11, 0, 'a3'),
  Text(12, 0, 'focus'),
  Text(13, 0, 'a6'),
  Text(14, 0, 'fiesta'),
  Text(15, 0, '2_reihe'),
  Text(16, 0, 'transporter'),
  Text(17, 0, 'twingo'),
  Text(18, 0, 'fortwo'),
  Text(19, 0, 'vectra')])

# Calculate the mean price for each model
model_prices = df.groupby("model")["price"].mean()

# Sort the mean prices in descending order
model_prices_sorted = model_prices.sort_values(ascending=False)

# Select the top 20 models by price
top_20_models = model_prices_sorted[:20]

# Create a bar plot of the top 20 models by price
plt.figure(figsize=(10, 6))
sns.barplot(x=top_20_models.index, y=top_20_models.values)
plt.title("Top 20 Car Models by Price")
plt.xlabel("Model")
plt.ylabel("Average Price")
plt.xticks(rotation=90)
plt.show()
<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='model'>
Text(0.5, 1.0, 'Top 20 Car Models by Price')
Text(0.5, 0, 'Model')
Text(0, 0.5, 'Average Price')
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 [Text(0, 0, 'q7'),
  Text(1, 0, 'glk'),
  Text(2, 0, 'a5'),
  Text(3, 0, 'q5'),
  Text(4, 0, 'range_rover_sport'),
  Text(5, 0, 'cc'),
  Text(6, 0, 'a1'),
  Text(7, 0, 'tiguan'),
  Text(8, 0, 'yeti'),
  Text(9, 0, 'kuga'),
  Text(10, 0, 'serie_3'),
  Text(11, 0, 'v60'),
  Text(12, 0, 'range_rover_evoque'),
  Text(13, 0, 'boxster'),
  Text(14, 0, 'cayenne'),
  Text(15, 0, 'qashqai'),
  Text(16, 0, 'b_max'),
  Text(17, 0, 'cx_reihe'),
  Text(18, 0, 'sl'),
  Text(19, 0, 'defender')])

# Calculate the mean price for each brand
brand_prices = df.groupby("brand")["price"].mean()

# Sort the mean prices in descending order
brand_prices_sorted = brand_prices.sort_values(ascending=False)

# Select the top 20 brands by price
top_20_brands = brand_prices_sorted[:20]

# Create a bar plot of the top 20 brands by price
plt.figure(figsize=(10, 6))
sns.barplot(x=top_20_brands.index, y=top_20_brands.values)
plt.title("Top 20 Car brands by Price")
plt.xlabel("brand")
plt.ylabel("Average Price")
plt.xticks(rotation=90)
plt.show()
<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='brand'>
Text(0.5, 1.0, 'Top 20 Car brands by Price')
Text(0.5, 0, 'brand')
Text(0, 0.5, 'Average Price')
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 [Text(0, 0, 'porsche'),
  Text(1, 0, 'mini'),
  Text(2, 0, 'land_rover'),
  Text(3, 0, 'jeep'),
  Text(4, 0, 'jaguar'),
  Text(5, 0, 'audi'),
  Text(6, 0, 'bmw'),
  Text(7, 0, 'skoda'),
  Text(8, 0, 'dacia'),
  Text(9, 0, 'mercedes_benz'),
  Text(10, 0, 'chevrolet'),
  Text(11, 0, 'hyundai'),
  Text(12, 0, 'kia'),
  Text(13, 0, 'toyota'),
  Text(14, 0, 'volkswagen'),
  Text(15, 0, 'volvo'),
  Text(16, 0, 'nissan'),
  Text(17, 0, 'seat'),
  Text(18, 0, 'suzuki'),
  Text(19, 0, 'honda')])

import plotly.express as px

# Count unique values of the vehicleType variable
vehicle_type_counts = df["vehicleType"].value_counts()

# Create a pie chart of the vehicleType variable
fig = px.pie(vehicle_type_counts, values=vehicle_type_counts.values, names=vehicle_type_counts.index)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Distribution of Vehicle Types by Count")
import plotly.express as px

# Calculate the average price for each vehicle type
avg_price_by_type = df.groupby("vehicleType")["price"].mean().reset_index()

# Create a pie chart of the vehicleType distribution by average price
fig = px.pie(avg_price_by_type, values="price", names="vehicleType")
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Distribution of Vehicle Types by Average Price")
fig.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Convert 'date' and 'dateCreated' columns to datetime
df['date'] = pd.to_datetime(df['date'])
df['dateCreated'] = pd.to_datetime(df['dateCreated'])

# Extract relevant date components if needed (e.g., year, month)
df['yearOfRegistration'] = pd.to_datetime(df['yearOfRegistration'], format='%Y', errors='coerce').dt.year

# Drop columns that are not required for modeling
df = df.drop(['date', 'time', 'name', 'seller', 'offerType', 'abtest', 'dateCreated', 'nrOfPictures', 'postalCode'], axis=1)

# Convert categorical variables to numerical using label encoding
label_encoder = LabelEncoder()
categorical_cols = ['vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage']
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col].astype(str))
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2980117876.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2980117876.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2980117876.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the models
linear_reg = LinearRegression()
random_forest = RandomForestRegressor()
xgb_reg = XGBRegressor()

# Train the models
linear_reg.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Make predictions
linear_reg_preds = linear_reg.predict(X_test)
random_forest_preds = random_forest.predict(X_test)
xgb_reg_preds = xgb_reg.predict(X_test)

# Calculate mean squared error (MSE)
linear_reg_mse = mean_squared_error(y_test, linear_reg_preds)
random_forest_mse = mean_squared_error(y_test, random_forest_preds)
xgb_reg_mse = mean_squared_error(y_test, xgb_reg_preds)

# Print the MSE for each model
print("Linear Regression MSE:", linear_reg_mse)
print("Random Forest MSE:", random_forest_mse)
print("XGBoost MSE:", xgb_reg_mse)

# Select the model with the lowest MSE
best_model = min(linear_reg_mse, random_forest_mse, xgb_reg_mse)
if best_model == linear_reg_mse:
    selected_model = linear_reg
    model_name = "Linear Regression"
elif best_model == random_forest_mse:
    selected_model = random_forest
    model_name = "Random Forest"
else:
    selected_model = xgb_reg
    model_name = "XGBoost"

print("Best Model:", model_name)
Linear Regression MSE: 9761443.63885572
Random Forest MSE: 2386370.046739888
XGBoost MSE: 2441435.7550129006
Best Model: Random Forest
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Define the sample data
sample_data = {
    'vehicleType': ['limousine'],
    'yearOfRegistration': [2010],
    'gearbox': ['automatik'],
    'powerPS': [150],
    'model': ['Audi A6'],
    'kilometer': [100000],
    'monthOfRegistration': [6],
    'fuelType': ['benzin'],
    'brand': ['Audi'],
    'notRepairedDamage': ['nein'],
    'age': [11]
}

# Create a DataFrame from the sample data
sample_df = pd.DataFrame(sample_data)

# Convert categorical variables to numerical using label encoding
label_encoder = LabelEncoder()
categorical_cols = ['vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage']
for col in categorical_cols:
    sample_df[col] = label_encoder.fit_transform(sample_df[col].astype(str))

# Use the best model for price prediction
predicted_price = selected_model.predict(sample_df)

print("Predicted Price:", predicted_price[0])
Predicted Price: 7746.22